CGC clustering¶

Install some packages¶

pip install matplotlib¶
pip install pandas¶
pip install joblib¶
pip install sklearn¶

In your terminal, type "python" to start the python.¶

Load Libraries¶

In [1]:
# for dealing with file directories
import os

# for making the plots
import matplotlib.pyplot as plt

# for dealing with dataframes
import pandas as pd

# for parallel computation
from joblib import Parallel, delayed

# for k mean clustering
from sklearn.cluster import KMeans

# for converting text sequences to features
from sklearn.feature_extraction.text import CountVectorizer

1. Prepare Data¶

In [2]:
# Set the your working directory
# os.chdir(r"/work/$your_group/$your_name/workshop_2022/CGC_clustering")
# example:
os.chdir(r"/work/yinlab/tangli/workshop_2022/CGC_clustering")
In [3]:
# Read dataframe
all_unsupervised = pd.read_csv("cgc_seq_substr_list.txt", sep="\t", header=None)
In [4]:
# Name columns for the dataframe
all_unsupervised.columns = [
    "Genome_ID",
    "CGC_ID",
    "sig_gene_seq",
    "low_level_substr",
    "Species",
]
In [5]:
all_unsupervised.head()
Out[5]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375
4 MGYG000000013 MGYG000000013_1|CGC5 GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... lactose,host glycan Bacteroides sp902362375
In [6]:
# View the shape of dataframe
all_unsupervised.shape
Out[6]:
(3352, 5)
In [7]:
# There are a few duplicates in the dataframe
all_unsupervised["sig_gene_seq"].value_counts()
Out[7]:
9.B.146,2.A.103,GT28                               24
GT28,2.A.103,9.B.146                               18
CBM50,3.A.5                                        17
1.B.14,8.A.46,GH18                                 16
3.A.5,CBM50                                        15
                                                   ..
2.A.66,GH16_3                                       1
GH95,1.B.14,GH2,GH35,GH43_10,1.B.14                 1
1.B.42,Aminotran_1_2,CBM67|GH78                     1
3.A.3,3.A.3,3.A.3,2.A.21,GerE,FecR,1.B.14,GH115     1
1.A.43,GH3,GH158,GH16_3,1.B.14                      1
Name: sig_gene_seq, Length: 2463, dtype: int64
In [8]:
# Show an example for duplicates
all_unsupervised[all_unsupervised["sig_gene_seq"] == "9.B.146,2.A.103,GT28"]
Out[8]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
10 MGYG000000013 MGYG000000013_1|CGC11 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp902362375
266 MGYG000000057 MGYG000000057_3|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp002491635
396 MGYG000000105 MGYG000000105_3|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides clarus
669 MGYG000000224 MGYG000000224_6|CGC5 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp003545565
707 MGYG000000236 MGYG000000236_3|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides fragilis_A
888 MGYG000000652 MGYG000000652_11|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides togonis
903 MGYG000000675 MGYG000000675_1|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides congonensis
1288 MGYG000001345 MGYG000001345_36|CGC11 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides xylanisolvens
1537 MGYG000001378 MGYG000001378_14|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides ovatus
1586 MGYG000001422 MGYG000001422_2|CGC10 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides oleiciplenus
1701 MGYG000001433 MGYG000001433_1|CGC39 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides salyersiae
1759 MGYG000001461 MGYG000001461_1|CGC10 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides neonati
1826 MGYG000001661 MGYG000001661_1|CGC8 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides gallinarum
1927 MGYG000002273 MGYG000002273_9|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides stercorirosoris
2392 MGYG000002549 MGYG000002549_18|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides caccae
2566 MGYG000003064 MGYG000003064_3|CGC7 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides graminisolvens
2714 MGYG000003252 MGYG000003252_545|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900761785
2739 MGYG000003312 MGYG000003312_4|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900547205
2922 MGYG000003363 MGYG000003363_45|CGC1 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900766195
3004 MGYG000003367 MGYG000003367_48|CGC5 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp007896885
3157 MGYG000004019 MGYG000004019_6|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides massiliensis_A
3195 MGYG000004185 MGYG000004185_1|CGC6 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900553815
3240 MGYG000004676 MGYG000004676_2|CGC3 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900766005
3310 MGYG000004899 MGYG000004899_9|CGC2 9.B.146,2.A.103,GT28 capsule polysaccharide Bacteroides sp900555635
In [9]:
# Remove duplicates
all_unsupervised = all_unsupervised.drop_duplicates("sig_gene_seq")
all_unsupervised.shape
Out[9]:
(2463, 5)
In [10]:
# Write out a csv file
#all_unsupervised.to_csv("unsupervised_cgc_data.tsv", index=False, sep="\t")
In [11]:
all_unsupervised.head()
Out[11]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375
4 MGYG000000013 MGYG000000013_1|CGC5 GT2,1.E.14,2.A.122,GH105,PL42,GH43_24,GH154,HT... lactose,host glycan Bacteroides sp902362375
In [12]:
# Some low-level subratrates have multiple categories in one string
# removing those
all_unsupervised["lens"] = [
    len(str(seq).split(",")) for seq in all_unsupervised["low_level_substr"]
]
In [13]:
# Keep only the clean ones (one subratrate)
all_unsupervised = all_unsupervised[all_unsupervised["lens"] == 1]
all_unsupervised
Out[13]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
... ... ... ... ... ... ...
3340 MGYG000004899 MGYG000004899_23|CGC1 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... xylan Bacteroides sp900555635 1
3342 MGYG000004899 MGYG000004899_28|CGC1 GT0,GT4,9.B.18,GT0 capsule polysaccharide Bacteroides sp900555635 1
3343 MGYG000004899 MGYG000004899_30|CGC1 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... beta-glucan Bacteroides sp900555635 1
3344 MGYG000004899 MGYG000004899_31|CGC1 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan Bacteroides sp900555635 1
3345 MGYG000004899 MGYG000004899_32|CGC1 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... rhamnogalacturonan Bacteroides sp900555635 1

1931 rows × 6 columns

In [14]:
# all_unsupervised = pd.concat([all_unsupervised_clean], ignore_index = True)
In [15]:
# Count the number of cgc sequences for each subratrate
all_unsupervised["low_level_substr"].value_counts()
Out[15]:
host glycan               346
capsule polysaccharide    201
rhamnogalacturonan        122
glycosaminoglycan         120
pectin                    108
                         ... 
fucosyllactose              1
cellobiose                  1
trehalose                   1
sorbitol                    1
beta-glucoside              1
Name: low_level_substr, Length: 64, dtype: int64
In [16]:
all_unsupervised["low_level_substr"]
Out[16]:
0                galactomannan
1                 alpha-mannan
2                  host glycan
3                       starch
5                 arabinoxylan
                 ...          
3340                     xylan
3342    capsule polysaccharide
3343               beta-glucan
3344               host glycan
3345        rhamnogalacturonan
Name: low_level_substr, Length: 1931, dtype: object
In [17]:
# Need to decide on delimeters
all_unsupervised[["sig_gene_seq"]].values[-1][0]
Out[17]:
'PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67|GH78,HTH_AraC,GH106'
In [18]:
# Treat "|" as ","
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",")
Out[18]:
'PL8_3,CBM67,GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.B.14,CBM67,GH78,HTH_AraC,GH106'
In [19]:
# Split the cgc sequences
all_unsupervised[["sig_gene_seq"]].values[-1][0].replace("|", ",").split(",")
Out[19]:
['PL8_3',
 'CBM67',
 'GH78',
 'GH28',
 'GH88',
 'GH2',
 'PL8_3',
 '8.A.46',
 '1.B.14',
 'CBM67',
 'GH78',
 'HTH_AraC',
 'GH106']
In [20]:
all_unsupervised.head()
Out[20]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
In [21]:
# check if there is missing value
all_unsupervised.isnull().sum()
# Drop missing values if possible
# all_unsupervised = all_unsupervised.dropna()
Out[21]:
Genome_ID           0
CGC_ID              0
sig_gene_seq        0
low_level_substr    0
Species             0
lens                0
dtype: int64
In [22]:
# Only keep subratrate classes which have more than 2 occurrences
count_df = pd.DataFrame(
    all_unsupervised["low_level_substr"].value_counts()[
        all_unsupervised["low_level_substr"].value_counts() >= 2
    ]
).reset_index()
In [23]:
# Get their names
to_keep_substrates = count_df["index"].values
to_keep_substrates
Out[23]:
array(['host glycan', 'capsule polysaccharide', 'rhamnogalacturonan',
       'glycosaminoglycan', 'pectin', 'alpha-mannan', 'N-glycan',
       'homogalacturonan', 'xylan', 'arabinoxylan', 'porphyran',
       'hemicellulose', 'mucin', 'acetylated glucuronoxylan',
       'beta-glucan', ' ', 'plant polysaccharide', 'xyloglucan',
       'arabinogalactan', 'fructan', 'O-antigen', 'galactomannan',
       'exopolysaccharide', 'sialoglycoconjugate', 'dextran', 'starch',
       'arabinan', 'cellulose', 'glycogen', 'carrageenan', 'alginate',
       'maltose', 'beta-mannan', 'O-glycan', 'pullulan', 'mannose',
       'alpha-glucan', 'galactan', 'sialic acid', 'ribose', 'chitin',
       'xanthan', 'arabinose', 'maltooligosaccharide',
       '4-methylumbelliferyl 6-azido-6-deoxy-beta-D-galactoside',
       'acarbose', 'glucomannan', 'glucose'], dtype=object)
In [24]:
# Subset to only those that are more than 2
all_unsupervised = all_unsupervised[
    all_unsupervised["low_level_substr"].isin(to_keep_substrates)
]
In [25]:
all_unsupervised
Out[25]:
Genome_ID CGC_ID sig_gene_seq low_level_substr Species lens
0 MGYG000000013 MGYG000000013_1|CGC1 1.B.14,GH29,GH16,GH76 galactomannan Bacteroides sp902362375 1
1 MGYG000000013 MGYG000000013_1|CGC2 1.B.14,HTH_AraC,GH92,GH76,GH130,GH92 alpha-mannan Bacteroides sp902362375 1
2 MGYG000000013 MGYG000000013_1|CGC3 GH18,8.A.46,1.B.14 host glycan Bacteroides sp902362375 1
3 MGYG000000013 MGYG000000013_1|CGC4 GH13,GH97,1.B.14 starch Bacteroides sp902362375 1
5 MGYG000000013 MGYG000000013_1|CGC6 2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_... arabinoxylan Bacteroides sp902362375 1
... ... ... ... ... ... ...
3340 MGYG000004899 MGYG000004899_23|CGC1 3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_... xylan Bacteroides sp900555635 1
3342 MGYG000004899 MGYG000004899_28|CGC1 GT0,GT4,9.B.18,GT0 capsule polysaccharide Bacteroides sp900555635 1
3343 MGYG000004899 MGYG000004899_30|CGC1 2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2... beta-glucan Bacteroides sp900555635 1
3344 MGYG000004899 MGYG000004899_31|CGC1 1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8.... host glycan Bacteroides sp900555635 1
3345 MGYG000004899 MGYG000004899_32|CGC1 PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1.... rhamnogalacturonan Bacteroides sp900555635 1

1915 rows × 6 columns

In [26]:
# view all cgc sequences in the subset
all_unsupervised["sig_gene_seq"]
Out[26]:
0                                   1.B.14,GH29,GH16,GH76
1                    1.B.14,HTH_AraC,GH92,GH76,GH130,GH92
2                                      GH18,8.A.46,1.B.14
3                                        GH13,GH97,1.B.14
5       2.A.21,GT2,2.A.1,SIS,GH115,GH95,HTH_AraC,GH43_...
                              ...                        
3340    3.A.5,GH36,HTH_AraC,GH2|CBM32,1.B.14,GH27,HTH_...
3342                                   GT0,GT4,9.B.18,GT0
3343    2.A.6,2.A.6,GH5_4,HTH_AraC,1.B.14,8.A.46,GH3,2...
3344    1.B.14,8.A.46,CBM67|GH78,GH3,FUR,3.A.3,GH31,8....
3345    PL8_3|CBM67|GH78,GH28,GH88,GH2,PL8_3,8.A.46,1....
Name: sig_gene_seq, Length: 1915, dtype: object
In [27]:
# Write selected out a csv file
#all_unsupervised.to_csv("unsupervised_cgc_selected.tsv", index=False, sep="\t")

2. Train and Test Subsets¶

In [28]:
# Train test split - Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split
In [29]:
# Train test split
# Split original data into 70% for clustering and 30% for finding the mappings
X_train, X_test, y_train, y_test = train_test_split(
    all_unsupervised["sig_gene_seq"],
    all_unsupervised["low_level_substr"],
    test_size=0.3,
    stratify=all_unsupervised["low_level_substr"].values,
)

## X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=array-like)
## test_size: the proportion of the dataset to include in the train split
## stratify: If not None, data is split in a stratified fashion, using this as the class labels.
In [30]:
# Supervised is for finding the mappings
X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
X_supervised
/tmp/ipykernel_3083247/4208520920.py:2: FutureWarning: In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only
  X_supervised = pd.concat([X_test, y_test], 1).reset_index(drop=True)
Out[30]:
sig_gene_seq low_level_substr
0 GT27,3.A.1 hemicellulose
1 1.B.14,8.A.46,GH33,GH20,GH2,GH20,GH20,1.B.14,8... sialoglycoconjugate
2 1.B.14,8.A.46,GH18,GH29,1.B.14 N-glycan
3 GT5,GH57,GT4,GH133,9.B.104,2.A.124,GH3,GH3,CE1... pectin
4 1.B.14,1.B.14,GH88 host glycan
... ... ...
570 CBM62|CBM62,TPR_1,2.A.16 4-methylumbelliferyl 6-azido-6-deoxy-beta-D-ga...
571 1.C.105,2.A.49,PL15_2 glycosaminoglycan
572 1.B.14,GH89,2.A.8,GH89 host glycan
573 GH95,2.A.51,2.A.51,GH20 sialoglycoconjugate
574 1.B.14,CBM32,1.B.14 host glycan

575 rows × 2 columns

In [31]:
# View data for clustering
X_train
Out[31]:
361                                     GH95,4.B.1,1.B.14
1143                            GH32,1.B.14,1.A.23,2.A.19
946     1.B.14,FecR,Sigma70_r2|Sigma70_r4_2,GH110,FecR...
546                                    2.A.1,GH165,1.B.14
2826                         9.B.171,PL15_2,HTH_AraC,GH88
                              ...                        
3268         GH28,GH92,GH89,HTH_AraC,CBM32|GH13_36,1.B.14
2947                                     GT26,GT32,2.A.66
3005      GH18|CE4|GT2,3.A.1,1.C.39,GT4,GT2,GT2,GT9,4.D.1
2086                             PL29,1.A.34,MerR,HD,GH23
2411                PL8_2,GH117|GH117,HTH_3,8.A.46,1.B.14
Name: sig_gene_seq, Length: 1340, dtype: object

3. Convert words to vectors¶

In [32]:
# Countvectorizer
# To convert a collection of text documents to a vector of term/token counts
count_vectorizer = CountVectorizer(
    tokenizer=lambda x: str(x).replace("|", ",").split(","), lowercase=False
)
In [33]:
# Fit the count vectorizer (learn a vocabulary dictionary of all tokens in the raw documents)
count_vectorizer.fit(X_train)
/util/opt/anaconda/deployed-conda-envs/packages/biopython/envs/biopython-1.79-py39/lib/python3.9/site-packages/sklearn/feature_extraction/text.py:489: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn("The parameter 'token_pattern' will not be used"
Out[33]:
CountVectorizer(lowercase=False,
                tokenizer=<function <lambda> at 0x151e86752a60>)
In [34]:
# Transform texts to sequences
count_vectorized_array = count_vectorizer.transform(X_train)
In [35]:
count_vectorized_array
# count_vectorized_array.toarray()
Out[35]:
<1340x420 sparse matrix of type '<class 'numpy.int64'>'
	with 7089 stored elements in Compressed Sparse Row format>
In [36]:
# vocabulary - dictionary containing the word (key) and vector (value)
vocabulary = count_vectorizer.vocabulary_
vocabulary
# View
first20vocabulary = {k: vocabulary[k] for k in sorted(vocabulary.keys())[:20]}
first20vocabulary
Out[36]:
{'1.A.1': 0,
 '1.A.11': 1,
 '1.A.13': 2,
 '1.A.22': 3,
 '1.A.23': 4,
 '1.A.26': 5,
 '1.A.28': 6,
 '1.A.30': 7,
 '1.A.33': 8,
 '1.A.34': 9,
 '1.A.35': 10,
 '1.A.62': 11,
 '1.A.78': 12,
 '1.A.8': 13,
 '1.B.14': 14,
 '1.B.17': 15,
 '1.B.18': 16,
 '1.B.42': 17,
 '1.B.44': 18,
 '1.B.5': 19}
In [37]:
# Inverse vocabulary
inv_map_vocab = {v: k for k, v in vocabulary.items()}
# View
first20_inv_vocabulary = {
    k: inv_map_vocab[k] for k in sorted(inv_map_vocab.keys())[:20]
}
first20_inv_vocabulary
Out[37]:
{0: '1.A.1',
 1: '1.A.11',
 2: '1.A.13',
 3: '1.A.22',
 4: '1.A.23',
 5: '1.A.26',
 6: '1.A.28',
 7: '1.A.30',
 8: '1.A.33',
 9: '1.A.34',
 10: '1.A.35',
 11: '1.A.62',
 12: '1.A.78',
 13: '1.A.8',
 14: '1.B.14',
 15: '1.B.17',
 16: '1.B.18',
 17: '1.B.42',
 18: '1.B.44',
 19: '1.B.5'}

4. CGC Clustering¶

In [38]:
# Affinity Propagation - a machine learning algorithm that can find data centers or clusters
# by sending messages between pairs of data points.
# does not need to know number of clusters
from sklearn.cluster import AffinityPropagation
In [39]:
# Fit the clustering algorithm on the unsupervised data
clustering = AffinityPropagation(random_state=5).fit(count_vectorized_array)
# random_state: pseudo-random number generator to control the starting state.
In [40]:
clustering.labels_
Out[40]:
array([ 71,  33, 136, ...,  82,  58,  18])
In [41]:
# For counting
# Counter: it is a collection where elements are stored as dictionary keys
# and their counts are stored as dictionary values.
from collections import Counter
In [42]:
# Count for number of clusters
cluster_freq_info = pd.DataFrame(Counter(clustering.labels_), index=[0]).T.reset_index()
# T: Transpose index and columns
# view
cluster_freq_info.shape
Out[42]:
(156, 2)
In [43]:
# Name columns and view
cluster_freq_info.columns = ["cluster_id", "number of samples"]
cluster_freq_info.head()
Out[43]:
cluster_id number of samples
0 71 21
1 33 29
2 136 4
3 91 20
4 137 4
In [44]:
# Select the top clusters for visualization
cluster_freq_info = cluster_freq_info.sort_values("number of samples", ascending=False)
In [45]:
cluster_freq_info
Out[45]:
cluster_id number of samples
7 134 67
49 38 54
24 57 46
10 149 38
1 33 29
... ... ...
127 63 1
100 31 1
129 66 1
61 15 1
155 153 1

156 rows × 2 columns

In [46]:
# We will create clusters to substrate mappings,
# convert strings to vector,
# pass the supervised strings through the count vectorizer

supervised_seqs_array = count_vectorizer.transform(X_supervised["sig_gene_seq"].values)
In [47]:
# supervised_seqs_array
supervised_seqs_array_dense = supervised_seqs_array.todense()
supervised_seqs_array_dense
Out[47]:
matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])
In [48]:
# Use the fit cluster object to predict clusters on the supervised set
supervised_set_clusters = clustering.predict(supervised_seqs_array_dense)
In [49]:
# Start to create the mapping
mapping_df = pd.DataFrame(X_supervised["low_level_substr"].values)
In [50]:
# Put the predicted clusters
mapping_df["cluster_id"] = supervised_set_clusters
In [51]:
# Sort and name the columns
mapping_df = mapping_df.sort_values("cluster_id")
mapping_df.columns = ["low_level_substr", "cluster_id"]
mapping_df
Out[51]:
low_level_substr cluster_id
118 host glycan 0
410 N-glycan 0
413 capsule polysaccharide 0
328 alginate 0
497 exopolysaccharide 0
... ... ...
126 exopolysaccharide 155
81 exopolysaccharide 155
252 dextran 155
238 arabinoxylan 155
150 host glycan 155

575 rows × 2 columns

In [52]:
# Aggregate substrate by using the cluster id
mapping_df = pd.DataFrame(
    mapping_df.groupby("cluster_id")["low_level_substr"].aggregate(pd.Series.mode)
).reset_index()
In [53]:
# Remove [] from low_level_substr column
catch = []
for seq in mapping_df["low_level_substr"].values:
    seq = str(seq).replace("]", "").replace("[", "")
    catch.append(seq)
In [54]:
mapping_df["low_level_substr"] = catch
mapping_df
Out[54]:
cluster_id low_level_substr
0 0 host glycan
1 1 beta-mannan
2 2 host glycan
3 3 arabinan
4 4 host glycan
... ... ...
114 149 alpha-mannan
115 151 capsule polysaccharide
116 152 host glycan
117 154 capsule polysaccharide
118 155 exopolysaccharide

119 rows × 2 columns

5. Visualization¶

In [55]:
# TSNE for visualization
# T-distributed Stochastic Neighbor Embedding - visualize high-dimensional data.
from sklearn.manifold import TSNE
In [56]:
# Covert sparse matrix
count_vectorized_array
Out[56]:
<1340x420 sparse matrix of type '<class 'numpy.int64'>'
	with 7089 stored elements in Compressed Sparse Row format>
In [57]:
# 3D plot
X_embedded = TSNE(
    n_components=3, init="pca"  # learning_rate='auto',init='random'
).fit_transform(count_vectorized_array.toarray())

# n_components: dimension of the embedded space.
# init: initialization of embedding.
# fit_transform: fit X into an embedded space and return that transformed output.
In [58]:
# View dataframe
X_embedded_df = pd.DataFrame(X_embedded)
X_embedded_df
Out[58]:
0 1 2
0 6.559994 -1.155289 -4.756688
1 -33.427830 -8.903884 45.734745
2 21.595331 50.405891 11.839166
3 9.262816 21.649462 -19.761250
4 -13.136441 32.683598 -59.671074
... ... ... ...
1335 26.730934 -46.706142 -48.666470
1336 -38.275902 19.213551 2.894465
1337 -64.072372 7.265289 -35.391850
1338 -21.172161 49.175171 39.198654
1339 46.532822 -42.846436 -32.027657

1340 rows × 3 columns

In [59]:
# Add cluster_id column to dataframe
X_embedded_df["cluster_id"] = clustering.labels_
X_embedded_df
Out[59]:
0 1 2 cluster_id
0 6.559994 -1.155289 -4.756688 71
1 -33.427830 -8.903884 45.734745 33
2 21.595331 50.405891 11.839166 136
3 9.262816 21.649462 -19.761250 91
4 -13.136441 32.683598 -59.671074 137
... ... ... ... ...
1335 26.730934 -46.706142 -48.666470 149
1336 -38.275902 19.213551 2.894465 77
1337 -64.072372 7.265289 -35.391850 82
1338 -21.172161 49.175171 39.198654 58
1339 46.532822 -42.846436 -32.027657 18

1340 rows × 4 columns

In [60]:
# Merge mapping_df with X_embedded_df by cluster_id
X_embedded_df = X_embedded_df.merge(mapping_df, how="left", on="cluster_id")
In [61]:
X_embedded_df
Out[61]:
0 1 2 cluster_id low_level_substr
0 6.559994 -1.155289 -4.756688 71 'arabinoxylan' 'glycosaminoglycan' 'host glycan'
1 -33.427830 -8.903884 45.734745 33 host glycan
2 21.595331 50.405891 11.839166 136 host glycan
3 9.262816 21.649462 -19.761250 91 'exopolysaccharide' 'host glycan'
4 -13.136441 32.683598 -59.671074 137 glycosaminoglycan
... ... ... ... ... ...
1335 26.730934 -46.706142 -48.666470 149 alpha-mannan
1336 -38.275902 19.213551 2.894465 77 'capsule polysaccharide' 'carrageenan'
1337 -64.072372 7.265289 -35.391850 82 capsule polysaccharide
1338 -21.172161 49.175171 39.198654 58 alginate
1339 46.532822 -42.846436 -32.027657 18 glycosaminoglycan

1340 rows × 5 columns

In [62]:
# View top 10 substrates
X_embedded_df["low_level_substr"].value_counts().head(n=10)
Out[62]:
host glycan                  193
capsule polysaccharide       111
rhamnogalacturonan           106
glycosaminoglycan            105
alpha-mannan                  72
homogalacturonan              55
beta-glucan                   54
'N-glycan' 'host glycan'      46
acetylated glucuronoxylan     36
xylan                         29
Name: low_level_substr, dtype: int64
In [63]:
# Select top 10 substrates
top_10_classes = X_embedded_df["low_level_substr"].value_counts()[:10]
top_10_classes
Out[63]:
host glycan                  193
capsule polysaccharide       111
rhamnogalacturonan           106
glycosaminoglycan            105
alpha-mannan                  72
homogalacturonan              55
beta-glucan                   54
'N-glycan' 'host glycan'      46
acetylated glucuronoxylan     36
xylan                         29
Name: low_level_substr, dtype: int64
In [64]:
# Make top 10 substrates to list
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)
top_10_classes_list
Out[64]:
['host glycan',
 'capsule polysaccharide',
 'rhamnogalacturonan',
 'glycosaminoglycan',
 'alpha-mannan',
 'homogalacturonan',
 'beta-glucan',
 "'N-glycan' 'host glycan'",
 'acetylated glucuronoxylan',
 'xylan']
In [65]:
# Subset X_embedded_df dataframe by list
X_embedded_df_viz = X_embedded_df[
    X_embedded_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_df_viz
Out[65]:
0 1 2 cluster_id low_level_substr
1 -33.427830 -8.903884 45.734745 33 host glycan
2 21.595331 50.405891 11.839166 136 host glycan
4 -13.136441 32.683598 -59.671074 137 glycosaminoglycan
5 -23.056034 -4.087102 29.942402 33 host glycan
7 -65.945145 30.958498 13.941587 151 capsule polysaccharide
... ... ... ... ... ...
1330 -53.821823 24.164757 -6.134601 154 capsule polysaccharide
1334 42.950474 -31.631836 -29.714808 99 glycosaminoglycan
1335 26.730934 -46.706142 -48.666470 149 alpha-mannan
1337 -64.072372 7.265289 -35.391850 82 capsule polysaccharide
1339 46.532822 -42.846436 -32.027657 18 glycosaminoglycan

807 rows × 5 columns

In [66]:
# astype:change a pandas object to a specified dtype
X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
/tmp/ipykernel_3083247/3803832902.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_embedded_df_viz["low_level_substr"] = X_embedded_df_viz["low_level_substr"].astype(str)
In [67]:
# plotly - an interactive, open-source, and browser-based graphing library
import plotly.express as px
In [68]:
fig = px.scatter_3d(X_embedded_df_viz, x=0, y=1, z=2, color="low_level_substr")
#fig.write_html("workshop_clusters.html")
In [69]:
# view plot
fig.show()
In [70]:
# view X_embedded_df dataframe
X_embedded_df.shape
Out[70]:
(1340, 5)
In [71]:
# Creat a new dataframe - summarize cgc sequences, cluster ids and substrates
final_df_cgc_cluster_substrate = X_embedded_df[["low_level_substr", "cluster_id"]]
In [72]:
final_df_cgc_cluster_substrate["sig_gene_seq"] = X_train.values
final_df_cgc_cluster_substrate["sig_gene_seq"]
/tmp/ipykernel_3083247/3070682039.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[72]:
0                                       GH95,4.B.1,1.B.14
1                               GH32,1.B.14,1.A.23,2.A.19
2       1.B.14,FecR,Sigma70_r2|Sigma70_r4_2,GH110,FecR...
3                                      2.A.1,GH165,1.B.14
4                            9.B.171,PL15_2,HTH_AraC,GH88
                              ...                        
1335         GH28,GH92,GH89,HTH_AraC,CBM32|GH13_36,1.B.14
1336                                     GT26,GT32,2.A.66
1337      GH18|CE4|GT2,3.A.1,1.C.39,GT4,GT2,GT2,GT9,4.D.1
1338                             PL29,1.A.34,MerR,HD,GH23
1339                PL8_2,GH117|GH117,HTH_3,8.A.46,1.B.14
Name: sig_gene_seq, Length: 1340, dtype: object
In [73]:
# Name the columns
final_df_cgc_cluster_substrate[["sig_gene_seq", "cluster_id", "low_level_substr"]]
Out[73]:
sig_gene_seq cluster_id low_level_substr
0 GH95,4.B.1,1.B.14 71 'arabinoxylan' 'glycosaminoglycan' 'host glycan'
1 GH32,1.B.14,1.A.23,2.A.19 33 host glycan
2 1.B.14,FecR,Sigma70_r2|Sigma70_r4_2,GH110,FecR... 136 host glycan
3 2.A.1,GH165,1.B.14 91 'exopolysaccharide' 'host glycan'
4 9.B.171,PL15_2,HTH_AraC,GH88 137 glycosaminoglycan
... ... ... ...
1335 GH28,GH92,GH89,HTH_AraC,CBM32|GH13_36,1.B.14 149 alpha-mannan
1336 GT26,GT32,2.A.66 77 'capsule polysaccharide' 'carrageenan'
1337 GH18|CE4|GT2,3.A.1,1.C.39,GT4,GT2,GT2,GT9,4.D.1 82 capsule polysaccharide
1338 PL29,1.A.34,MerR,HD,GH23 58 alginate
1339 PL8_2,GH117|GH117,HTH_3,8.A.46,1.B.14 18 glycosaminoglycan

1340 rows × 3 columns

In [74]:
# View substrate by using cluster id
mapping_df[mapping_df["cluster_id"] == 0]
Out[74]:
cluster_id low_level_substr
0 0 host glycan
In [75]:
# View number of sequences by using cluster id
cluster_freq_info[cluster_freq_info["cluster_id"] == 68]
Out[75]:
cluster_id number of samples
25 68 3
In [76]:
final_df_cgc_cluster_substrate["cluster_id"].value_counts()[:10]
Out[76]:
134    67
38     54
57     46
149    38
33     29
61     28
92     27
62     27
14     26
60     24
Name: cluster_id, dtype: int64
In [77]:
# Write out a csv file
#final_df_cgc_cluster_substrate.to_csv("cgc_cluster_substrate.tsv", index=False, sep="\t")
In [78]:
# 2D plot
X_embedded_2D = TSNE(n_components=2, init="pca").fit_transform(
    count_vectorized_array.toarray()
)

X_embedded_2D_df = pd.DataFrame(X_embedded_2D)
X_embedded_2D_df["cluster_id"] = clustering.labels_
X_embedded_2D_df = X_embedded_2D_df.merge(mapping_df, how="left", on="cluster_id")
X_embedded_2D_df
Out[78]:
0 1 cluster_id low_level_substr
0 5.919950 5.166234 71 'arabinoxylan' 'glycosaminoglycan' 'host glycan'
1 -6.791799 6.786656 33 host glycan
2 8.243697 27.787916 136 host glycan
3 6.899367 11.483501 91 'exopolysaccharide' 'host glycan'
4 -28.028196 17.804018 137 glycosaminoglycan
... ... ... ... ...
1335 30.564066 21.215761 149 alpha-mannan
1336 -37.680904 6.757950 77 'capsule polysaccharide' 'carrageenan'
1337 -49.747913 23.629011 82 capsule polysaccharide
1338 -31.859072 23.341862 58 alginate
1339 47.376091 -6.875403 18 glycosaminoglycan

1340 rows × 4 columns

In [79]:
top_10_classes = X_embedded_2D_df["low_level_substr"].value_counts()[:10]
top_10_classes_list = list(pd.DataFrame(top_10_classes).index)

X_embedded_2D_df_viz = X_embedded_df[
    X_embedded_2D_df["low_level_substr"].isin(top_10_classes_list)
]
X_embedded_2D_df_viz["low_level_substr"] = X_embedded_2D_df_viz[
    "low_level_substr"
].astype(str)

fig = px.scatter(X_embedded_2D_df_viz, x=0, y=1, color="low_level_substr")
#fig.write_html("workshop_clusters_2D_10top.html")
fig.show()
/tmp/ipykernel_3083247/2587830699.py:7: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [ ]: